rm(list=ls())
library(ezids)
library(ggplot2)
library(dplyr)
library(readr)
library(tidyverse)
library (tidyr)
library(janitor)
library(scales)
library(ggrepel)
library(corrplot)
library(tigris)
library(sf)
This project uses the State and County Housing Market Indicators dataset from the American Enterprise Institute Housing Center, found here. The variables are:
| Original Variable Name | New Variable Name | Definition |
|---|---|---|
| State | State | state |
| County | County_Name | County |
| FIPS | FIPS_County_Code | 5-digit Federal Information Processing Series codes (first 2 digits indicate state, last 3 indicate sub-county entity) |
| Year | Year | Year when the data was collected |
| Tier | Affordability | Categorizes home sales into entry-level (<=80th percentile of FHA sales prices), move-up (all others), and all |
| Median.Sale.Price..in.Thousands. | Median_Sale_Price_in_k | Median sale price in thousands of USD per county |
| House.Price.Appreciation.since.2012 | House_Price_Appreciation_since_2012_percent | Cumulative home price appreciation since 2012 |
| House.Price.Appreciation..Year.over.Year | House_Price_Appreciation_yr_over_yr_percent | Home price appreciation since the previous year |
| Months..Supply | Months_Supply | Number of months it would take for the inventory of existing homes for sale to be exhausted at the current sales pace |
| New.Construction.Share.of.Sales | New_Constr_by_share_of_sales_percent | Percent of sales comprising new construction |
| Mortgage.Default.Rate | Mortgage_Default_Rate_percent | AEI Mortgage Default Rate, a measure of how loans originating in a given month would perform under the same conditions as the 2007 financial crisis (<=7%: Low Risk; between 7.01% and 14%: Medium Risk; >14%: High Risk) |
housing = read.csv("/Users/ilgazkuscu/Documents/GitHub/housing-price-vs-supply-2024/Data/state_county_data_download_2025.csv")
housing %>% slice_sample(n=5)
## State County FIPS Year Tier Median.Sale.Price..in.Thousands.
## 1 LA Claiborne Parish 22027 2013 moveup 245
## 2 MA Dukes County 25007 2020 moveup 925
## 3 AL Madison County 1089 2020 all 229
## 4 TX Coryell County 48099 2013 moveup 208
## 5 MI Otsego County 26137 2020 entrylevel 118
## House.Price.Appreciation.since.2012 House.Price.Appreciation..Year.over.Year.
## 1 -4.09999862313271% -4.10000011324883%
## 2 61.6999983787537% 7.10000023245812%
## 3 49.8000025749207% 12.7000004053116%
## 4 -3.19999679923058% -3.20000015199184%
## 5 73.5000014305115% 11.8000000715256%
## Months..Supply New.Construction.Share.of.Sales Mortgage.Default.Rate
## 1 NA 0%
## 2 5.4 7.99999982118607% 5.20000010728836%
## 3 1.2 25.2000004053116% 11.4000000059605%
## 4 6.2 26.4999985694885% 17.0000001788139%
## 5 1.7 1.09999999403954% 13.6000007390976%
The data is limited to the year 2024 and cleaned of NA values, and the variables are renamed for clarity.
housing_2024 = housing %>% filter(housing$Year == 2024,
housing$State != 'AA National',
housing$County != 'AA State') %>%
#moving this na removal to the end after the cleaning
na.omit %>%
#excluding the armed forces
#rename cols
rename(
Median_Sale_Price_per_k = Median.Sale.Price..in.Thousands.,
House_Price_Appreciation_yr_over_yr_percent = House.Price.Appreciation..Year.over.Year.,
House_Price_Appreciation_since_2012_percent = House.Price.Appreciation.since.2012,
Months_Supply = Months..Supply,
New_Constr_by_share_of_sales_percent = New.Construction.Share.of.Sales,
Mortgage_Default_Rate_percent = Mortgage.Default.Rate,
County_Name = County,
FIPS_County_Code = FIPS,
Affordability = Tier
)
head(housing_2024)
## State County_Name FIPS_County_Code Year Affordability
## 7 AK Anchorage Municipality 2020 2024 all
## 8 AK Anchorage Municipality 2020 2024 entrylevel
## 9 AK Anchorage Municipality 2020 2024 moveup
## 10 AK Bethel Census Area 2050 2024 all
## 11 AK Bethel Census Area 2050 2024 entrylevel
## 12 AK Bethel Census Area 2050 2024 moveup
## Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent
## 7 414 51.8999934196472%
## 8 325 49.6999979019165%
## 9 581 54.1999995708466%
## 10 389
## 11 350
## 12 516
## House_Price_Appreciation_yr_over_yr_percent Months_Supply
## 7 4.50000017881393% 2.5
## 8 3.99999991059303% 2.2
## 9 5.00000007450581% 2.8
## 10 4.8
## 11 5.3
## 12 3.2
## New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
## 7 4.39999997615814% 12.3999997973442%
## 8 1.89999993890524% 13.6000007390976%
## 9 7.90000036358833% 9.79999974370003%
## 10 11.6999998688698% 16.2000000476837%
## 11 0% 4.30000014603138%
## 12 60.9000027179718% 22.2000002861023%
The typing of the variables is also corrected. Some require the symbols “$” and “%” to be removed beforehand, so that is also done.
# as factors
housing_2024$State = as.factor(housing_2024$State)
housing_2024$County_Name = as.factor(housing_2024$County_Name)
housing_2024$FIPS_County_Code = as.factor(housing_2024$FIPS_County_Code)
housing_2024$Affordability = as.factor(housing_2024$Affordability)
# remove prefixes '$' and '%' from values
housing_2024 = housing_2024 %>%
mutate(Median_Sale_Price_per_k = gsub("[\\$,]", "", Median_Sale_Price_per_k),
House_Price_Appreciation_since_2012_percent =
gsub("[%,]","",House_Price_Appreciation_since_2012_percent),
House_Price_Appreciation_yr_over_yr_percent =
gsub("[%,]","",House_Price_Appreciation_yr_over_yr_percent),
New_Constr_by_share_of_sales_percent = gsub("[%,]","",New_Constr_by_share_of_sales_percent),
Mortgage_Default_Rate_percent = gsub("[%,]","",Mortgage_Default_Rate_percent)
)
###I included the commas to be removed as well indicated in the brackets. SA
# as num instead of chr
housing_2024$Median_Sale_Price_per_k = as.numeric(housing_2024$Median_Sale_Price_per_k)
housing_2024$House_Price_Appreciation_since_2012_percent =
as.numeric(housing_2024$House_Price_Appreciation_since_2012_percent)
housing_2024$House_Price_Appreciation_yr_over_yr_percent =
as.numeric(housing_2024$House_Price_Appreciation_yr_over_yr_percent)
housing_2024$New_Constr_by_share_of_sales_percent =
as.numeric(housing_2024$New_Constr_by_share_of_sales_percent)
housing_2024$Mortgage_Default_Rate_percent = as.numeric(housing_2024$Mortgage_Default_Rate_percent)
###I included the commas to be removed as well indicated in the brackets. SA
#moving the na dropping to after the parsing
housing_2024 <- housing_2024 %>%
tidyr::drop_na(Median_Sale_Price_per_k)
# For some reason is rounding the data in quite a weird way—inaccurately
# view data
housing_2024 %>% slice_sample(n=5)
## State County_Name FIPS_County_Code Year Affordability
## 1 ID Lewis County 16061 2024 all
## 2 SD Minnehaha County 46099 2024 all
## 3 CA Los Angeles County 6037 2024 entrylevel
## 4 OR Clackamas County 41005 2024 entrylevel
## 5 IA Clay County 19041 2024 entrylevel
## Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent
## 1 233 173
## 2 308 118
## 3 640 174
## 4 450 150
## 5 122 72
## House_Price_Appreciation_yr_over_yr_percent Months_Supply
## 1 -10.7 2.7
## 2 3.6 4.5
## 3 4.1 3.3
## 4 2.1 2.9
## 5 1.1 2.7
## New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
## 1 2.7 13.0
## 2 25.3 8.0
## 3 4.6 13.1
## 4 9.8 11.8
## 5 3.6 13.1
#creating df w/ affordability all totals and then another with entry level/moving up totals
housing_2024_all = housing_2024 %>% filter(housing_2024$Affordability == "all")
housing_2024_tiers = housing_2024 %>% filter(Affordability == "entrylevel" | Affordability == "moveup")
#BW- we need to explain why we did this (not in the comments, in the write-up and presentation). Steph?
xkablesummary(housing_2024)
| State | County_Name | FIPS_County_Code | Year | Affordability | Median_Sale_Price_per_k | House_Price_Appreciation_since_2012_percent | House_Price_Appreciation_yr_over_yr_percent | Months_Supply | New_Constr_by_share_of_sales_percent | Mortgage_Default_Rate_percent | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | TX : 620 | Washington County: 80 | 1001 : 3 | Min. :2024 | all :2751 | Min. : 14 | Min. : 3.7 | Min. :-32.90 | Min. : 0.00 | Min. : 0.0 | Min. : 0.2 |
| Q1 | GA : 477 | Jefferson County : 72 | 1003 : 3 | 1st Qu.:2024 | entrylevel:2741 | 1st Qu.: 148 | 1st Qu.: 86.6 | 1st Qu.: 2.80 | 1st Qu.: 2.20 | 1st Qu.: 2.7 | 1st Qu.:10.2 |
| Median | KY : 340 | Franklin County : 63 | 1005 : 3 | Median :2024 | moveup :2669 | Median : 260 | Median :105.2 | Median : 5.70 | Median : 3.10 | Median : 6.9 | Median :13.9 |
| Mean | VA : 319 | Lincoln County : 60 | 1007 : 3 | Mean :2024 | NA | Mean : 293 | Mean :110.7 | Mean : 5.88 | Mean : 4.11 | Mean : 10.7 | Mean :14.3 |
| Q3 | NC : 300 | Jackson County : 57 | 1009 : 3 | 3rd Qu.:2024 | NA | 3rd Qu.: 391 | 3rd Qu.:129.6 | 3rd Qu.: 8.70 | 3rd Qu.: 5.00 | 3rd Qu.: 15.0 | 3rd Qu.:17.5 |
| Max | IA : 297 | Madison County : 57 | 1017 : 3 | Max. :2024 | NA | Max. :4400 | Max. :279.1 | Max. : 89.20 | Max. :24.00 | Max. :100.0 | Max. :36.0 |
| NA | (Other):5808 | (Other) :7772 | (Other):8143 | NA | NA | NA | NA’s :1253 | NA’s :1280 | NA | NA | NA’s :1036 |
#separated df's
xkablesummary(housing_2024_all)
| State | County_Name | FIPS_County_Code | Year | Affordability | Median_Sale_Price_per_k | House_Price_Appreciation_since_2012_percent | House_Price_Appreciation_yr_over_yr_percent | Months_Supply | New_Constr_by_share_of_sales_percent | Mortgage_Default_Rate_percent | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | TX : 217 | Washington County: 27 | 1001 : 1 | Min. :2024 | all :2751 | Min. : 17 | Min. : 3.7 | Min. :-28.70 | Min. : 0.10 | Min. : 0.00 | Min. : 0.5 |
| Q1 | GA : 159 | Jefferson County : 24 | 1003 : 1 | 1st Qu.:2024 | entrylevel: 0 | 1st Qu.: 140 | 1st Qu.: 88.9 | 1st Qu.: 3.00 | 1st Qu.: 2.20 | 1st Qu.: 2.90 | 1st Qu.:11.4 |
| Median | KY : 114 | Franklin County : 21 | 1005 : 1 | Median :2024 | moveup : 0 | Median : 200 | Median :107.3 | Median : 5.95 | Median : 3.00 | Median : 6.40 | Median :14.5 |
| Mean | VA : 107 | Lincoln County : 20 | 1007 : 1 | Mean :2024 | NA | Mean : 242 | Mean :113.4 | Mean : 6.00 | Mean : 3.44 | Mean : 9.51 | Mean :14.8 |
| Q3 | NC : 100 | Jackson County : 19 | 1009 : 1 | 3rd Qu.:2024 | NA | 3rd Qu.: 303 | 3rd Qu.:132.6 | 3rd Qu.: 8.60 | 3rd Qu.: 4.10 | 3rd Qu.:12.50 | 3rd Qu.:17.6 |
| Max | IA : 99 | Madison County : 19 | 1017 : 1 | Max. :2024 | NA | Max. :2300 | Max. :278.0 | Max. : 89.20 | Max. :24.00 | Max. :75.40 | Max. :36.0 |
| NA | (Other):1955 | (Other) :2621 | (Other):2745 | NA | NA | NA | NA’s :293 | NA’s :301 | NA | NA | NA’s :323 |
xkablesummary(housing_2024_tiers)
| State | County_Name | FIPS_County_Code | Year | Affordability | Median_Sale_Price_per_k | House_Price_Appreciation_since_2012_percent | House_Price_Appreciation_yr_over_yr_percent | Months_Supply | New_Constr_by_share_of_sales_percent | Mortgage_Default_Rate_percent | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | TX : 403 | Washington County: 53 | 1001 : 2 | Min. :2024 | all : 0 | Min. : 14 | Min. : 4.5 | Min. :-32.90 | Min. : 0.00 | Min. : 0.0 | Min. : 0.2 |
| Q1 | GA : 318 | Jefferson County : 48 | 1003 : 2 | 1st Qu.:2024 | entrylevel:2741 | 1st Qu.: 155 | 1st Qu.: 85.4 | 1st Qu.: 2.70 | 1st Qu.: 2.10 | 1st Qu.: 2.6 | 1st Qu.: 9.7 |
| Median | KY : 226 | Franklin County : 42 | 1005 : 2 | Median :2024 | moveup :2669 | Median : 314 | Median :104.4 | Median : 5.60 | Median : 3.30 | Median : 7.3 | Median :13.6 |
| Mean | VA : 212 | Lincoln County : 40 | 1007 : 2 | Mean :2024 | NA | Mean : 319 | Mean :109.2 | Mean : 5.82 | Mean : 4.45 | Mean : 11.3 | Mean :14.0 |
| Q3 | NC : 200 | Jackson County : 38 | 1009 : 2 | 3rd Qu.:2024 | NA | 3rd Qu.: 419 | 3rd Qu.:128.3 | 3rd Qu.: 8.70 | 3rd Qu.: 5.60 | 3rd Qu.: 16.1 | 3rd Qu.:17.5 |
| Max | IA : 198 | Madison County : 38 | 1017 : 2 | Max. :2024 | NA | Max. :4400 | Max. :279.1 | Max. : 86.80 | Max. :24.00 | Max. :100.0 | Max. :36.0 |
| NA | (Other):3853 | (Other) :5151 | (Other):5398 | NA | NA | NA | NA’s :960 | NA’s :979 | NA | NA | NA’s :713 |
#boxplotting med sale price by state w/ ..._all df
#all
ggplot(housing_2024_all, aes(x = reorder(State, -Median_Sale_Price_per_k, median),
y = Median_Sale_Price_per_k)) +
geom_boxplot(fill = "steelblue", alpha = 0.7) +
coord_flip() +
labs(
title = "Distribution of Median Sale Prices by State (2024)",
x = "State",
y = "Median Sale Price (in thousands)",
caption = "SOURCE: American Enterprise Institute") +
theme_minimal()
There are too many to be particularly useful. You can see general trends, but I am going to run this with a smaller state sample.
The states broken out by housing count extremes are:
####Top Months Supply States
HI | Hawaii DC | District of Columbia FL | Florida MT | Montana CO |
Colorado
####Bottom Months Supply States MA| Massachusetts WI | Wisconsin KY | Kentucky NH | New Hampshire IN | Indiana ND | North Dakota OH | Ohio
#boxplotting with new df: ..._all
#Find top/bottom 5 by *mean Months Supply*, not row count
states_by_mean_months_supply <- housing_2024_all %>%
group_by(State) %>%
summarize(Mean_Months_Supply = mean(Months_Supply))
top_states_all = states_by_mean_months_supply %>% slice_max(order_by = Mean_Months_Supply, n = 5)
bottom_states_all <- states_by_mean_months_supply %>% slice_min(order_by = Mean_Months_Supply, n = 5)
#merge top and bottom states
housing_compare_all <- housing_2024_all %>%
filter(State %in% c(top_states_all$State, bottom_states_all$State)) %>%
mutate(StateGroup = case_when(
State %in% top_states_all$State ~ "High Months Supply",
State %in% bottom_states_all$State ~ "Low Months Supply")) %>%
mutate(StateGroup = factor(StateGroup, levels = c("Low Months Supply", "High Months Supply")))
#plot top bottom comparison ####
ggplot(housing_compare_all, aes(x = State, y = Median_Sale_Price_per_k, fill = StateGroup)) +
geom_boxplot() +
facet_wrap(~ StateGroup, scales = "free_x") +
labs(
title = "Mean Sale Price in States by Months Supply of Housing",
subtitle = "The Mean Sale Price in States with a Larger Months Supply of Housing is Significantly Lower\nthan States with a Smaller Months Supply of Housing",
x = "State",
y = "Mean Sale Price (in thousands)",
caption = "SOURCE: American Enterprise Institute") +
theme_minimal() +
scale_fill_manual(values = c("High Months Supply" = "skyblue", "Low Months Supply" = "red"))
#states_mean_sale_price_anova = aov(data = housing_compare_all)
# compare color scale w/ ..._all df
#fix color assigning for side by side compare scatterplots- too many blue showing up
#cleaned plot version with dropped unused (non-10) state levels; re-group by state for clarity
housing_compare_all_plot <- housing_compare_all %>%
group_by(State, StateGroup) %>%
summarize(
Median_Sale_Price_per_k = median(Median_Sale_Price_per_k, na.rm = TRUE),
Months_Supply = median(Months_Supply, na.rm = TRUE),
.groups = "drop") %>%
mutate(State = forcats::fct_drop(State))
#also fixing subset of housing 2024 all so that gray background dots are 1/state
housing_2024_all_plot <- housing_2024_all %>%
group_by(State) %>%
summarize(
Median_Sale_Price_per_k = median(Median_Sale_Price_per_k, na.rm = TRUE),
Months_Supply = median(Months_Supply, na.rm = TRUE),
.groups = "drop")
#label points for 10 states
label_points_all <- housing_compare_all_plot
#usig cleaned version of plot data to assign colors
top_states_colors_all <- scales::seq_gradient_pal("lightblue", "darkslateblue")(seq(0, 1, length.out = length(unique(housing_compare_all_plot$State[housing_compare_all_plot$StateGroup == "High Months Supply"]))))
names(top_states_colors_all) <- unique(housing_compare_all_plot$State[housing_compare_all_plot$StateGroup == "High Months Supply"])
bottom_states_colors_all <- scales::seq_gradient_pal("lightpink", "darkred")(seq(0, 1, length.out = length(unique(housing_compare_all_plot$State[housing_compare_all_plot$StateGroup == "Low Months Supply"]))))
names(bottom_states_colors_all) <- unique(housing_compare_all_plot$State[housing_compare_all_plot$StateGroup == "Low Months Supply"])
state_colors_all <- c(top_states_colors_all, bottom_states_colors_all)
#all in gray with faceted compare in color with states labeled ####
ggplot() +
geom_point(data = housing_2024_all_plot, aes(x = Months_Supply, y = Median_Sale_Price_per_k),
color = "gray70", alpha = 0.3, size = 1.5) +
geom_point(data = housing_compare_all_plot,
aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
size = 1.5, alpha = 0.9) +
geom_text_repel(data = label_points_all,
aes(x = Months_Supply, y = Median_Sale_Price_per_k,
label = State),
size = 3.5, color= "black", stroke=0.01, segment.color = NA, segment.size = 0.3,
segment.alpha = 1, min.segment.length = 0, show.legend = FALSE) +
facet_wrap(~ StateGroup) +
scale_color_manual(values = state_colors_all) +
labs(
title = "Months Supply vs Median Price by State",
subtitle = "The association between Low Months Supply and High Median Sale Price is Immediately Visible, While the",
x = "Months of Supply",
y = "Median Sale Price (in thousands)",
color = "State",
caption = "SOURCE: American Enterprise Institute") +
theme_minimal()
#boxplotting med sale price by state w/ ..._compare_all df
#all
ggplot(housing_compare_all, aes(x = reorder(State, -Median_Sale_Price_per_k, median),
y = Median_Sale_Price_per_k)) +
geom_boxplot(fill = state_colors_all, alpha = 0.7) +
coord_flip() +
labs(
title = "Distribution of Median Sale Prices by State (2024)",
x = "State",
y = "Median Sale Price (in thousands)",
caption = "SOURCE: American Enterprise Institute") +
theme_minimal()
#hist plotting with df ..._all
ggplot(housing_2024_all, aes(x = Median_Sale_Price_per_k)) +
geom_histogram(binwidth = 50, fill = "skyblue", color = "black") +
scale_x_continuous(labels = scales::dollar_format(prefix = "$", suffix = "k")) +
labs(
title = "Distribution of Median Sale Prices (2024)",
x = "Median Sale Price (in thousands)",
y = "Count",
caption = "SOURCE: American Enterprise Institute") +
theme_minimal()
#needs a subtitle with commentary, just remembered all need a caption with the source named, and we can prob add other elements to this, because otherwise it seems to obvi to me
ggplot(housing_2024, aes(x = Affordability, y = Median_Sale_Price_per_k, fill = Affordability)) +
geom_boxplot() +
labs(
title = "Median Sale Price by Affordability Tier (2024)",
x = "Affordability Tier",
y = "Median Sale Price (in thousands)",
caption = "SOURCE: American Enterprise Institute") +
theme_minimal()
#no duh, there are more unpurchased expensive houses because people can't afford it
#not sure how useful this is, but maybe as a starting baseline
ggplot(housing_2024_all_plot, aes(x = Months_Supply, y = Median_Sale_Price_per_k)) +
geom_point(color="steelblue", alpha = 0.7) +
# geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "Months Supply of Housing vs Median Sale Price",
subtitle= "A Scatterplot Reveals a Positive Correlation Between Months Supply\nof Housing and Median Sale Price",
x = "Months of Supply",
y = "Median Sale Price (in thousands)",
caption = "SOURCE: American Enterprise Institute") +
theme_minimal()
#next round, add line of best fit/ regression line
#correlation heatmap of numeric variables w/ ..._all df
housing_numeric_all <- housing_2024_all %>%
select(where(is.numeric) & !all_of("Year")) %>%
drop_na()
cor_matrix_all <- cor(housing_numeric_all)
corrplot(cor_matrix_all, method = "color", type = "upper", tl.cex = 0.8, addCoef.col = "black", number.cex=0.5)
#interesting to see positive and inverse relationships between variables
#did not scan for no relationships
#both together in gray with compare in color with states labeled w/ ..._all df
ggplot() +
geom_point(data = housing_2024_all,
aes(x = Months_Supply, y = Median_Sale_Price_per_k),
color = "gray70", alpha = 0.3, size = 1) +
# geom_path(data = housing_compare %>% filter(StateGroup == "States with the Most Houses"),
# aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
# alpha = 0.8, size = 1) +
geom_point(data = housing_compare_all %>% filter(StateGroup == "States with the Most Houses"),
aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
alpha = 0.5, size = 2) +
# geom_path(data = housing_compare %>% filter(StateGroup == "States with the Fewest Houses"),
# aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
# alpha = 0.8, size = 1) +
geom_point(data = housing_compare_all %>% filter(StateGroup == "States with the Fewest Houses"),
aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
alpha = 0.5, size = 2) +
scale_color_manual(values = state_colors_all) +
labs(
title = "Housing Supply vs Median Price: All Counties with Highlights",
subtitle = "Gray points: All counties | Blue shades: States with Most Houses | Red shades: States with Fewest Houses",
x = "Months of Supply",
y = "Median Sale Price (in thousands)",
color = "State",
caption = "SOURCE: American Enterprise Institute") +
theme_minimal()
housing_constr = housing_2024
housing_constr = housing_constr %>%
mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))
ggplot(housing_constr, aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
geom_boxplot() +
labs(
title = "Median Sale Price by Percent New Construction (2024)",
x = "New Construction by Share of Sales Percent",
y = "Median Sale Price (in thousands)",
caption = "SOURCE: American Enterprise Institute") +
theme_minimal()
# Perhaps new construction indicates high demand, and as such, the more new construction, the higher the median sale price?
#I would like to know which states fall into these buckets. In which states are the top quarter and bottom quarter? BW
#I'm going to break this into smaller buckets and investigate.BW
#new construction vs median sale price w/ ..._all df
housing_constr_all = housing_2024_all
housing_constr_all = housing_constr_all %>%
mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))
ggplot(housing_constr_all, aes(x = constr_bins, y = Median_Sale_Price_per_k, fill=constr_bins)) +
geom_boxplot() +
labs(
title = "Median Sale Price by Percent New Construction (2024)",
x = "New Construction by Share of Sales Percent",
y = "Median Sale Price (in thousands)",
caption = "SOURCE: American Enterprise Institute") +
theme_minimal()
# Perhaps new construction indicates high demand, and as such, the more new construction, the higher the median sale price?
housing_constr_tiers = housing_2024_tiers
housing_constr_tiers = housing_constr_tiers %>%
mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))
ggplot(filter(housing_constr,Affordability=="entrylevel"), aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
geom_boxplot() +
labs(
title = "Median Sale Price by Percent New Construction (2024)",
x = "New Construction by Share of Sales Percent",
y = "Median Sale Price (in thousands)",
caption = "SOURCE: American Enterprise Institute") +
theme_minimal()
# Even more pronounced here
ggplot(filter(housing_constr,Affordability=="moveup"), aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
geom_boxplot() +
labs(
title = "Median Sale Price by Percent New Construction (2024)",
x = "New Construction by Share of Sales Percent",
y = "Median Sale Price (in thousands)",
caption = "SOURCE: American Enterprise Institute") +
theme_minimal()
# Even more pronounced here
###mapping US counties by median sale price
##basemap
#reading in counties spatial file from tigris package
counties_2024 <- counties(year = 2013,
cb = TRUE,
class = "sf")
## | | | 0% | | | 1% | |= | 1% | |= | 2% | |== | 2% | |== | 3% | |=== | 4% | |=== | 5% | |==== | 5% | |==== | 6% | |===== | 7% | |===== | 8% | |====== | 8% | |====== | 9% | |======= | 9% | |======= | 10% | |======= | 11% | |======== | 11% | |======== | 12% | |========= | 12% | |========= | 13% | |========== | 14% | |========== | 15% | |=========== | 15% | |=========== | 16% | |============ | 16% | |============ | 17% | |============ | 18% | |============= | 18% | |============= | 19% | |============== | 19% | |============== | 20% | |============== | 21% | |=============== | 21% | |=============== | 22% | |================ | 22% | |================ | 23% | |================ | 24% | |================= | 24% | |================= | 25% | |================== | 25% | |================== | 26% | |=================== | 27% | |=================== | 28% | |==================== | 28% | |==================== | 29% | |===================== | 29% | |===================== | 30% | |===================== | 31% | |====================== | 31% | |====================== | 32% | |======================= | 32% | |======================= | 33% | |======================== | 34% | |======================== | 35% | |========================= | 35% | |========================= | 36% | |========================== | 36% | |========================== | 37% | |========================== | 38% | |=========================== | 38% | |=========================== | 39% | |============================ | 39% | |============================ | 40% | |============================ | 41% | |============================= | 41% | |============================= | 42% | |============================== | 42% | |============================== | 43% | |=============================== | 44% | |=============================== | 45% | |================================ | 45% | |================================ | 46% | |================================= | 46% | |================================= | 47% | |================================= | 48% | |================================== | 48% | |================================== | 49% | |=================================== | 49% | |=================================== | 50% | |=================================== | 51% | |==================================== | 51% | |==================================== | 52% | |===================================== | 52% | |===================================== | 53% | |====================================== | 54% | |====================================== | 55% | |======================================= | 55% | |======================================= | 56% | |======================================== | 57% | |======================================== | 58% | |========================================= | 58% | |========================================= | 59% | |========================================== | 59% | |========================================== | 60% | |========================================== | 61% | |=========================================== | 61% | |=========================================== | 62% | |============================================ | 62% | |============================================ | 63% | |============================================ | 64% | |============================================= | 64% | |============================================= | 65% | |============================================== | 65% | |============================================== | 66% | |=============================================== | 67% | |=============================================== | 68% | |================================================ | 68% | |================================================ | 69% | |================================================= | 69% | |================================================= | 70% | |================================================= | 71% | |================================================== | 71% | |================================================== | 72% | |=================================================== | 72% | |=================================================== | 73% | |==================================================== | 74% | |==================================================== | 75% | |===================================================== | 75% | |===================================================== | 76% | |====================================================== | 76% | |====================================================== | 77% | |====================================================== | 78% | |======================================================= | 78% | |======================================================= | 79% | |======================================================== | 79% | |======================================================== | 80% | |======================================================== | 81% | |========================================================= | 81% | |========================================================= | 82% | |========================================================== | 82% | |========================================================== | 83% | |========================================================== | 84% | |=========================================================== | 84% | |=========================================================== | 85% | |============================================================ | 85% | |============================================================ | 86% | |============================================================= | 87% | |============================================================= | 88% | |============================================================== | 88% | |============================================================== | 89% | |=============================================================== | 89% | |=============================================================== | 90% | |=============================================================== | 91% | |================================================================ | 91% | |================================================================ | 92% | |================================================================= | 92% | |================================================================= | 93% | |================================================================= | 94% | |================================================================== | 94% | |================================================================== | 95% | |=================================================================== | 95% | |=================================================================== | 96% | |==================================================================== | 96% | |==================================================================== | 97% | |==================================================================== | 98% | |===================================================================== | 98% | |===================================================================== | 99% | |======================================================================| 99% | |======================================================================| 100%
#excluding territories and shifting AK/HI
counties_2024 <- counties_2024 %>%
filter(as.numeric(STATEFP) < 60) %>%
shift_geometry()
#removing duplicate columns
counties_2024 <- counties_2024 %>%
select(-NAME)
#creating basemap and checking
counties_2024_basemap <- ggplot() +
geom_sf(data = counties_2024) +
theme_void()
counties_2024_basemap
##prepping and merging housing_2014_all w/ counties_2014
housing_2024_all <- housing_2024_all %>%
mutate(price_bin = cut(Median_Sale_Price_per_k,
breaks = c(0, 150, 300, 500, 750, 1000, Inf),
labels = c("<150k", "150-300k", "300-500k", "500-750k", "750k-1M", ">1M")))
#creating merge df and prepping counties df
housing_merge <- housing_2024_all
counties_2024 <- counties_2024 %>%
mutate(GEOID = as.numeric(GEOID))
#adding id variables to track observations
counties_2024$id1 <- 1
housing_merge$id2 <- 1
#merging
housing_counties_merge <- merge(x = counties_2024,
y = housing_merge,
by.x = "GEOID",
by.y = "FIPS_County_Code",
all = TRUE)
#converting NAs to zeros and finding problem observations
#looks like it merged just fine
#i lied, coded wrong and fixed, now i realize theres a
#lot of problem observations
housing_counties_merge <- housing_counties_merge %>%
mutate(id1 = ifelse(is.na(id1), 0, id1),
id2 = ifelse(is.na(id2), 0, id2))
no_merge <- housing_counties_merge %>%
filter(id1 + id2 != 2)
unique(no_merge$GEOID)
## [1] 1011 1013 1015 1023 1035 1041 1049 1055 1063 1071 1075 1079
## [13] 1081 1087 1093 1105 1107 1123 2013 2016 2068 2070 2105 2164
## [25] 2170 2185 2188 2198 2240 2261 2270 2275 2282 2290 5049 5077
## [37] 9013 15005 17003 17013 17045 17055 17061 17071 17123 17147 17155 17159
## [49] 17169 17171 17185 20003 20005 20007 20009 20019 20021 20023 20025 20027
## [61] 20029 20031 20033 20035 20039 20041 20043 20047 20049 20051 20053 20057
## [73] 20059 20063 20065 20067 20071 20075 20077 20079 20081 20083 20085 20089
## [85] 20093 20095 20097 20099 20101 20105 20107 20109 20113 20115 20117 20119
## [97] 20123 20125 20127 20129 20131 20133 20135 20137 20139 20141 20143 20145
## [109] 20147 20151 20153 20155 20157 20159 20161 20163 20165 20167 20171 20175
## [121] 20179 20181 20183 20185 20187 20189 20191 20193 20195 20199 20201 20203
## [133] 20205 20207 21095 21099 21105 21127 21129 21133 22021 22023 22025 22029
## [145] 22035 22039 22041 22043 22067 22083 22091 22093 22095 22107 22123 26009
## [157] 26029 26063 26075 26091 26095 26097 26113 26133 26153 28001 28003 28005
## [169] 28007 28011 28015 28019 28021 28023 28027 28031 28037 28041 28043 28049
## [181] 28051 28053 28055 28057 28061 28063 28065 28067 28069 28077 28083 28095
## [193] 28097 28099 28107 28115 28119 28125 28131 28135 28141 28143 28145 28149
## [205] 28157 28159 28161 28163 29001 29003 29015 29017 29057 29065 29081 29083
## [217] 29085 29087 29089 29093 29103 29113 29129 29139 29171 29177 29197 29199
## [229] 29221 29227 30003 30005 30011 30015 30017 30019 30025 30033 30037 30041
## [241] 30065 30071 30079 30083 30085 30091 30095 30099 30103 30107 30109 31053
## [253] 31075 31115 31165 32009 35011 35017 35021 35023 35059 38085 42021 45067
## [265] 46003 46007 46009 46011 46013 46015 46017 46023 46025 46031 46035 46037
## [277] 46041 46045 46047 46049 46051 46053 46055 46057 46059 46061 46063 46069
## [289] 46071 46075 46077 46079 46081 46083 46085 46087 46089 46091 46093 46095
## [301] 46097 46101 46105 46107 46113 46117 46119 46121 46123 46125 46129 46137
## [313] 48017 48023 48033 48045 48081 48095 48101 48111 48117 48125 48155 48173
## [325] 48195 48207 48243 48247 48261 48267 48269 48271 48275 48301 48305 48311
## [337] 48327 48333 48341 48345 48371 48389 48393 48411 48417 48445 48447 48455
## [349] 48483 49009 49015 49017 49019 49023 49031 49037 49055 51005 51515 51520
## [361] 51530 51540 51570 51580 51595 51600 51610 51620 51630 51640 51660 51670
## [373] 51678 51683 51685 51690 51720 51750 51770 51775 51790 51820 51830 51840
## [385] 53019 53039 53075 54003 54039 54093 56017 56027
#plotting and checking
housing_counties_plot <- ggplot() +
geom_sf(data = housing_counties_merge,
mapping = aes(fill = Median_Sale_Price_per_k),
color = "white",
linewidth = .4) +
#scale_fill_brewer(palette = "GnBi") +
theme(legend.position = "none") +
theme_void()
housing_counties_plot
#for fun: trying st_simplify to see if it looks better
housing_counties_merge_simple <- st_simplify(housing_counties_merge, dTolerance = 75)
housing_counties_plot2 <- ggplot() +
geom_sf(data = housing_counties_merge_simple,
mapping = aes(fill = price_bin),
color = "white",
linewidth = .4) +
scale_fill_brewer(palette = "RdYlBu",
direction = -1,
name = "Median Sale Price",
na.value = "grey85") +
labs(title = "Geographic Variation in U.S. Home Prices, 2024",
subtitle = "County-level median sale price",
caption = "Source: AEI Housing Center (2024)") +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face = "bold",
size = 16),
plot.subtitle = element_text(size = 12,
margin = margin(b = 8)),
plot.caption = element_text(color = "gray40",
margin = margin(t = 8)),
legend.title = element_text(face = "bold"),
legend.position = "right",
legend.key.height = unit(0.6, "cm"),
panel.grid = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
plot.margin = margin(8, 12, 8, 12))
#+ theme_void()
housing_counties_plot2
####looks like there are just missing data for counties as they show up as white or gray in my map. guessing it has something to do with the removing na's code not working that alex and i were wondering about. will need to get to the bottom of this.
####i could of course leave the color gradient and show the concentration or i could bin ranges of prices to different colors on the gradient scale. perhaps by affordability or something else that's useful? want to convene with team.
#BW: I think bins and/or a more drastic color scale could help readability. Like- dark purple, medium pink, orangey-yellow would be immediately visibly different
####i plan to add useful titles, labels, and commentary where necessary when i work on this next.
#BW: when the regional differences are more clear, we need to think about the story this map tells and how it fits into our overall narrative.
#hist of months supply
#hist plotting months supply with df ...top_states_all
ggplot(housing_compare_all, aes(x = reorder(State, Months_Supply, median),
y = Months_Supply)) +
geom_boxplot(fill = state_colors_all, alpha = 0.7) +
coord_flip() +
labs(
title = "Distribution of Months Supply (2024)",
x = "State",
y = "Months Supply",
caption = "SOURCE: American Enterprise Institute") +
theme_minimal()
#needs a subtitle with commentary, just remembered all need a caption with the source named, and we can prob add other elements to this, because otherwise it seems to obvi to me
#z score
#confidence interval (on what tho, maybe median sale price and months supply)
#then add Zscore and CI onto scatterplot somehow?
ggplot(housing_compare_all,
aes(x = reorder(State, Months_Supply, median),
y = Median_Sale_Price_per_k)) +
geom_boxplot(fill = state_colors_all, alpha = 0.7) +
coord_flip() +
labs(
title = "Median Sale Price by State Ordered by Months of Supply (2024)",
x = "State (ordered by median Months of Supply)",
y = "Median Sale Price (in thousands)",
caption = "SOURCE: American Enterprise Institute"
) +
theme_minimal()
# Count number of observations (homes / county entries) per state
state_counts <- housing_compare_all %>%
group_by(State) %>%
summarise(
n_entries = n(),
median_MoS = median(Months_Supply, na.rm = TRUE),
median_price = median(Median_Sale_Price_per_k, na.rm = TRUE)
) %>%
arrange(desc(n_entries))
knitr::kable(state_counts, caption = "Count of Observations by State (2024)")
| State | n_entries | median_MoS | median_price |
|---|---|---|---|
| KY | 114 | 2.10 | 131 |
| IN | 92 | 2.30 | 176 |
| OH | 88 | 2.30 | 170 |
| WI | 72 | 1.90 | 225 |
| FL | 67 | 5.30 | 310 |
| CO | 64 | 4.80 | 370 |
| UT | 21 | 4.60 | 426 |
| NH | 10 | 2.25 | 401 |
| HI | 4 | 7.65 | 860 |
| DC | 1 | 6.50 | 635 |
# Count number of observations (homes / county entries) per state
state_counts <- housing_2024_all %>%
group_by(State) %>%
summarise(
n_entries = n(),
median_MoS = median(Months_Supply, na.rm = TRUE),
median_price = median(Median_Sale_Price_per_k, na.rm = TRUE)
) %>%
arrange(desc(n_entries))
knitr::kable(state_counts, caption = "Count of Observations by State (2024)")
| State | n_entries | median_MoS | median_price |
|---|---|---|---|
| TX | 217 | 4.10 | 206 |
| GA | 159 | 3.30 | 194 |
| KY | 114 | 2.10 | 131 |
| VA | 107 | 3.00 | 270 |
| NC | 100 | 3.75 | 254 |
| IA | 99 | 3.00 | 145 |
| TN | 95 | 3.40 | 215 |
| MO | 93 | 2.40 | 194 |
| IN | 92 | 2.30 | 176 |
| IL | 89 | 2.30 | 120 |
| NE | 89 | 2.40 | 155 |
| OH | 88 | 2.30 | 170 |
| MN | 87 | 2.80 | 215 |
| OK | 77 | 3.80 | 135 |
| AR | 73 | 3.40 | 130 |
| MI | 73 | 2.60 | 168 |
| WI | 72 | 1.90 | 225 |
| FL | 67 | 5.30 | 310 |
| PA | 66 | 2.50 | 181 |
| CO | 64 | 4.80 | 370 |
| NY | 62 | 3.10 | 194 |
| CA | 58 | 3.75 | 442 |
| ND | 52 | 2.30 | 120 |
| WV | 52 | 2.50 | 108 |
| AL | 49 | 3.10 | 162 |
| LA | 49 | 3.80 | 161 |
| SC | 45 | 3.80 | 220 |
| ID | 44 | 3.70 | 340 |
| MS | 39 | 2.30 | 166 |
| OR | 36 | 4.65 | 384 |
| WA | 36 | 3.90 | 415 |
| MT | 35 | 4.20 | 336 |
| NM | 28 | 2.75 | 238 |
| MD | 24 | 2.95 | 389 |
| KS | 22 | 2.60 | 207 |
| NJ | 21 | 3.10 | 520 |
| UT | 21 | 4.60 | 426 |
| WY | 21 | 3.60 | 279 |
| SD | 18 | 2.35 | 208 |
| ME | 16 | 2.45 | 255 |
| NV | 16 | 2.75 | 312 |
| AZ | 15 | 3.80 | 275 |
| MA | 14 | 1.45 | 622 |
| VT | 14 | 2.75 | 309 |
| AK | 13 | 3.00 | 389 |
| NH | 10 | 2.25 | 401 |
| CT | 7 | 3.00 | 349 |
| RI | 5 | 2.60 | 584 |
| HI | 4 | 7.65 | 860 |
| DE | 3 | 3.90 | 350 |
| DC | 1 | 6.50 | 635 |